The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000, and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van, and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder # Label encoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import KFold
# Load the vehicle silhouette dataset from the working directory.
data = pd.read_csv('vehicle.csv')
data.head(5)  # preview the first five rows
data.info()  # column dtypes and non-null counts
data.isnull().sum() # checking columns which has missing values with missing data count per column
# Bar chart of class frequencies, with the count printed above each bar.
ax = sns.countplot(x="class", data=data, palette="pastel")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
for p in ax.patches:
    # Center each label over its bar: the original used a fixed +0.1 x-offset,
    # which contradicts ha='center' and misplaces labels for other bar widths.
    ax.annotate('{}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height() + 5),
                ha='center')
plt.show()
The car count is greater than the van and bus counts; the van count is the lowest.
# Encode the string class labels as integer codes so that correlation
# and model-fitting steps below can treat the target numerically.
encoder = LabelEncoder()
data['class'] = encoder.fit_transform(data['class'])
data['class'].value_counts()  # class balance after encoding
data.describe().T  # summary statistics, one row per feature
# Drop rows that have more than one missing value; rows with a single
# missing value are kept and imputed with the column median below.
# Using index labels throughout avoids the positional-vs-label mismatch
# of iterating range(len(data)) and then calling data.drop(i).
many_nulls = data.isnull().sum(axis=1) > 1
null_values_indexs = list(data.index[many_nulls])
for label in null_values_indexs:
    print("Nan in row ", label, " : ", data.loc[label].isnull().sum())
print(f'\nDropping rows: {null_values_indexs}')
data.drop(null_values_indexs, inplace=True)
# Replace the remaining missing values with the median of each column.
# Assigning the result back (instead of chained fillna(inplace=True) on a
# column selection) is safe under pandas copy-on-write semantics.
for col in data.columns:
    data[col] = data[col].fillna(data[col].median())
data.isnull().sum() # verify no missing values remain per column
pos = 1  # position of the current subplot within the grid
for feature in data.columns:  # visualize the distribution of every attribute
    if pos == 1:
        plt.figure(figsize=(30, 20))  # start a fresh figure every 4 plots
    plt.subplot(3, 4, pos)  # plot grid
    if feature != 'class':
        # Histogram with KDE overlay for continuous columns.
        # sns.distplot was deprecated and removed in seaborn 0.14;
        # histplot(kde=True) is its replacement.
        sns.histplot(data[feature], kde=True)
    else:
        # Bar chart for the categorical target column (keyword arg required
        # by modern seaborn; positional Series form is deprecated).
        sns.countplot(x=data[feature], palette='Blues')
    pos += 1  # advance over the grid one by one
    if pos > 4:
        pos = 1  # wrap: the next iteration opens a new figure
# Features whose histograms above show two modes; every entry carries the
# same annotation, so build the mapping from the key list.
peak_ponts = {feature: 'Total 2 peaks.' for feature in (
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'max.length_aspect_ratio',
    'scatter_ratio',
    'pr.axis_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'hollows_ratio',
)}
# Horizontal box plots of every feature, to eyeball spread and outliers.
plt.figure(figsize=(15,15))
sns.boxplot(data=data, orient='h')
# Skewness interpretation:
#   = 0 : normally distributed
#   > 0 : more weight in the right tail
#   < 0 : more weight in the left tail
for col in data.columns:
    skewness = skew(data[col])
    # Branch order matters: a NaN skewness falls through every comparison
    # and keeps the empty label, matching the original behavior.
    label = ''
    if skewness == 0:
        label = 'Normally distributed.'
    elif skewness > 0:
        label = 'More weight in the right tail of the distribution.'  # right skewed
    elif skewness < 0:
        label = 'More weight in the left tail of the distribution.'  # left skewed
    # Note about the number of histogram peaks, when recorded above.
    peak_label = peak_ponts.get(col, '')
    print(f'- Skewness of {col} is {skewness}. {label} { peak_label } \n')
# Annotated heatmap of the full feature correlation matrix.
plt.figure(figsize = (20,15))
sns.set_style(style = 'white')
g = sns.heatmap(data.corr(), annot=True, cmap = 'summer_r', square=True, linewidth=1, cbar_kws={'fraction' : 0.02})
g.set_yticklabels(g.get_yticklabels(), rotation=0, horizontalalignment='right')
# Workaround for the matplotlib 3.1.1 bug that clips the first and last
# heatmap rows; expands the y-limits by half a cell on each side.
bottom, top = g.get_ylim()
g.set_ylim(bottom + 0.5, top - 0.5)
# Create the absolute correlation matrix
corr_matrix = data.corr().abs()
# Select the upper triangle (k=1 excludes the diagonal).
# np.bool was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# bool is the correct dtype here.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
mask = upper == 0 # to mask the upper triangle in the following heatmap
plt.figure(figsize = (15,8)) # setting the figure size
sns.set_style(style = 'white') # hide grid lines behind the heatmap
g = sns.heatmap(upper, center=0.5, cmap= 'summer_r', annot= True, xticklabels = corr_matrix.index,
yticklabels = corr_matrix.columns, cbar= False, linewidths= 1, mask = mask)
g.set_yticklabels(g.get_yticklabels(), rotation=0, horizontalalignment='right')
# Workaround for the matplotlib 3.1.1 row-clipping bug.
bottom, top = g.get_ylim()
g.set_ylim(bottom + 0.5, top - 0.5)
plt.xticks(rotation = 50) # Aesthetic purposes
plt.yticks(rotation = 20) # Aesthetic purposes
plt.show()
# Find feature columns with any pairwise correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(f'We can drop {to_drop} \n')
# Bar chart of each predictor's correlation with the encoded target,
# followed by a pairwise scatter grid of the predictors.
features_only = data.drop('class', axis=1)
target_corr = features_only.corrwith(data['class'])
target_corr.plot.bar(figsize=(14, 6), title="Correlation with Target", fontsize=12, grid=True)
sns.pairplot(data.drop('class', axis=1))
# Independent variables, standardized to zero mean / unit variance
# (SVM with an RBF kernel is sensitive to feature scale).
X = data.drop(['class'], axis=1)
XScaled = StandardScaler().fit_transform(X)
# The dependent variable.
y = data['class']
# Hold out 30% for testing, stratified to keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    XScaled, y, test_size=0.30, stratify=y, random_state=1)
print(f'Shape of train data set: {X_train.shape}')
print(f'Shape of test data set: {X_test.shape}')
# Fit an RBF-kernel SVM on the standardized features and evaluate on the
# held-out test set.
clf = svm.SVC(gamma=0.025, C=3, kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
svc_score = clf.score(X_test, y_test)
print(f'SVM accuracy: {svc_score}')
# confusion_matrix expects (y_true, y_pred); the original swapped the
# arguments, which prints the transposed matrix.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
# 10-fold cross-validation of the same SVM configuration on the full data.
clf1 = svm.SVC(gamma=0.025, C=3, kernel='rbf')
# shuffle=True is required for random_state to take effect; modern
# scikit-learn (>=0.24) raises a ValueError when random_state is set
# without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
svc_cross_val_score = model_selection.cross_val_score(clf1, XScaled, y, cv=kfold, scoring='accuracy')
print(svc_cross_val_score)
svc_cross_val_score_avg = round(svc_cross_val_score.mean() * 100.0, 2)
svc_cross_val_score_std = round(svc_cross_val_score.std() * 100.0, 2)
print(f"\nCross validation accuracy is { svc_cross_val_score_avg }%. Standard deviation is { svc_cross_val_score_std }%")
# Fit PCA on all standardized features to inspect the explained variance
# of every component before choosing how many to keep.
pca = PCA()
pca.fit(XScaled)
The percentage of variation explained by each eigenvector is:
print(pca.explained_variance_ratio_)
# Component indices derived from the fitted PCA instead of the hard-coded
# range(1, 19), so the plots stay correct if the feature count changes.
component_ids = list(range(1, len(pca.explained_variance_ratio_) + 1))
plt.figure(figsize=(20, 15))
# Scree plot: variance explained by each individual component.
plt.subplot(3, 2, 1)
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
# Cumulative variance explained, to pick the cutoff.
plt.subplot(3, 2, 2)
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
Now 11 dimensions seem very reasonable: with 11 variables we can explain over 99% of the variation in the original data!
# Re-fit PCA keeping the 11 components chosen from the scree plot,
# projecting the standardized features in the same call.
pca2 = PCA(n_components=11)
Xpca2 = pca2.fit_transform(XScaled)
print(pca2.explained_variance_ratio_)
sum(pca2.explained_variance_ratio_)  # total variance retained
sns.pairplot(pd.DataFrame(Xpca2))
# Split the PCA-projected data into test and train with the same seed and
# stratification as the original-feature split, for a fair comparison.
X_train, X_test, y_train, y_test = train_test_split(Xpca2, y, random_state=1, stratify=y, test_size=0.30)
print(f'Shape of train data set: {X_train.shape}')
print(f'Shape of test data set: {X_test.shape}')
# kernel made explicit for consistency with the earlier model
# ('rbf' is scikit-learn's default, so behavior is unchanged).
clf = svm.SVC(gamma=0.025, C=3, kernel='rbf')
clf.fit(X_train, y_train)
pca_svc_score = clf.score(X_test, y_test)
print(f'SVM accuracy after PCA: {pca_svc_score}')
y_pred = clf.predict(X_test)
# confusion_matrix expects (y_true, y_pred); the original swapped the
# arguments, which prints the transposed matrix.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
# 10-fold cross-validation of the SVM on the 11 principal components.
clf1 = svm.SVC(gamma=0.025, C=3, kernel='rbf')
# shuffle=True is required for random_state to take effect; modern
# scikit-learn (>=0.24) raises a ValueError otherwise.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
pca_svc_cross_val_score = model_selection.cross_val_score(clf1, Xpca2, y, cv=kfold, scoring='accuracy')
print(pca_svc_cross_val_score)
pca_svc_cross_val_score_avg = round(pca_svc_cross_val_score.mean() * 100.0, 2)
pca_svc_cross_val_score_std = round(pca_svc_cross_val_score.std() * 100.0, 2)
print(f"\nCross validation accuracy is {pca_svc_cross_val_score_avg}%. Standard deviation is {pca_svc_cross_val_score_std}%")
# Summary: SVM on all 18 features vs SVM on 11 principal components.
print(f'\nSVM score: { round(svc_score * 100.0, 2)}%')
print(f"Cross validation accuracy is { svc_cross_val_score_avg }%. Standard deviation is { svc_cross_val_score_std }%")
print(f'\nSVM Score after PCA: {round(pca_svc_score * 100.0, 2)}%')
print(f"Cross validation accuracy is { pca_svc_cross_val_score_avg }%. Standard deviation is { pca_svc_cross_val_score_std }%")
Looks like by reducing the dimensionality by 7, we only dropped around 1% in accuracy! This is in-sample (on training data), and hence some drop is expected. It still seems easy to justify dropping the variables. Out of sample (on test data), the model with the 11 independent variables is likely to do better, since it would be less of an over-fit.